In [1]:
# Computations
import numpy as np
import pandas as pd
import re

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## missingno
import missingno as msno

## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and the old
# names were removed afterwards; try the new name first and fall back for older releases.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")
Pima Indians Diabetes Dataset

In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.

Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

Content

The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

Preprocessing

In [2]:
Path = 'pima-indians-diabetes-database/diabetes.csv'
Data = pd.read_csv(Path)
# Insert a space at every lower-case -> upper-case boundary ("BloodPressure" -> "Blood Pressure").
# An all-caps name such as "BMI" has no such boundary, so it is left intact without a special case.
Data.columns = [re.sub(r"(?<=[a-z])(?=[A-Z])", " ", x) for x in Data.columns]
display(Data.head())

# One-row summary of the dataset dimensions, rendered without the index column.
Shape_Style = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                            'Number of Attributes': [Data.shape[1]]}).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in favour of
# Styler.hide(axis='index'); use whichever this pandas version provides.
if hasattr(Shape_Style, 'hide'):
    Shape_Style = Shape_Style.hide(axis='index')
else:
    Shape_Style = Shape_Style.hide_index()
display(Shape_Style)
del Shape_Style
Pregnancies Glucose Blood Pressure Skin Thickness Insulin BMI Diabetes Pedigree Function Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
Number of Instances Number of Attributes
768 9
Feature Explanations
Pregnancies Number of times pregnant
Glucose Plasma glucose concentration at 2 hours in an oral glucose tolerance test
BloodPressure Diastolic blood pressure (mm Hg)
SkinThickness Triceps skinfold thickness (mm)
Insulin 2-Hour serum insulin (mu U/ml)
BMI Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction Diabetes pedigree function
Age Age (years)
Outcome Whether or not a patient has diabetes
In [3]:
def Data_info(Inp, Only_NaN = False):
    """Summarise each column of ``Inp``: dtype, NaN count, row count and completeness.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to describe.
    Only_NaN : bool, default False
        When True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name (index named 'Features') with the columns
        'Data Type' (dtype as a string), 'Number of NaN Values', 'Size'
        (number of rows of ``Inp``) and 'Percentage' (100 minus the percentage
        of NaN entries, rounded to two decimals).
    """
    n_rows = Inp.shape[0]
    dtype_frame = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_frame = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')

    summary = dtype_frame.join(nan_frame, how='outer')
    summary['Size'] = n_rows
    # Share of non-missing entries, i.e. 100 % minus the NaN percentage.
    nan_share = 100*(summary['Number of NaN Values']/n_rows)
    summary['Percentage'] = 100 - np.round(nan_share,2)
    summary.index.name = 'Features'
    summary['Data Type'] = summary['Data Type'].astype(str)

    if not Only_NaN:
        return summary
    return summary.loc[summary['Number of NaN Values']>0]

# Bar chart of the 'Percentage' column produced by Data_info (100 minus the NaN
# percentage of each feature), coloured and labelled by the feature's dtype.
data_info = Data_info(Data).reset_index(drop = False)

fig = px.bar(
    data_info,
    x='Features',
    y='Percentage',
    color='Data Type',
    text='Data Type',
    color_discrete_sequence=['PaleGreen', 'LightBlue', 'PeachPuff'],
    hover_data=data_info.columns,
)
fig.update_layout(
    plot_bgcolor='white',
    width=700,
    legend={'x': 1, 'y': .5, 'traceorder': "normal",
            'bordercolor': "DarkGray", 'borderwidth': 1},
)
# Text template (six leading spaces of padding) and bar outline, applied in one pass.
fig.update_traces(
    texttemplate=6*' ' + '%{label}',
    textposition='inside',
    marker_line_color='Black',
    marker_line_width=1.,
    opacity=1,
)
fig.show()

Let's take a close look at our data.

In [4]:
# One panel per column except the last one: step histogram, KDE curve and a red rug.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# migrating to histplot/kdeplot/rugplot when the seaborn version is upgraded.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
ax = ax.ravel()
for i, feature in enumerate(Data.columns[:-1]):
    sns.distplot(
        Data[feature],
        rug=True,
        rug_kws={"color": "red"},
        kde_kws={"color": "k", "lw": 2, "label": "KDE"},
        hist_kws={"histtype": "step", "linewidth": 2, "alpha": 1, "color": "Navy"},
        ax=ax[i],
    )
In [5]:
# Hover text: a readable class name for every row, derived from the 'Outcome' column.
Temp = ['Diabetic' if x != 0 else 'Non-Diabetic' for x in Data['Outcome']]

# (axis label, column name) for each plotted feature; '<br>' wraps long labels.
Splom_Axes = [('Pregnancies', 'Pregnancies'),
              ('Glucose', 'Glucose'),
              ('Blood<br>Pressure', 'Blood Pressure'),
              ('Skin<br>Thickness', 'Skin Thickness'),
              ('Insulin', 'Insulin'),
              ('BMI', 'BMI'),
              ('Diabetes<br>Pedigree<br>Fun', 'Diabetes Pedigree Function'),
              ('Age', 'Age')]

Splom_Trace = go.Splom(
    dimensions=[dict(label=axis_label, values=Data[column]) for axis_label, column in Splom_Axes],
    showupperhalf=False,
    diagonal=dict(visible=False),
    text=Temp,
    # Points are coloured by 'Outcome' on the 'Bluered' colour scale.
    marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
)
fig = go.Figure(data=Splom_Trace)
del Temp, Splom_Axes, Splom_Trace

fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

Normalizing the Data

As can be seen, the data are roughly normally distributed, but some entries need to be adjusted. To do so, we define a normalizer as follows: for a given vector $x$ with mean $\mu$ and standard deviation $\sigma$,

\begin{align*} \text{Normalizer}(x, cut) = \begin{cases} x_i &\mbox{if } |x_i- \mu|<\sigma\times cut \\ mode(x) & \mbox{else} \end{cases}. \end{align*}
In [6]:
def Normalizer(Col, cut = 3):
    """Return the entries of ``Col`` lying strictly within ``cut`` standard deviations of its mean.

    Parameters
    ----------
    Col : pandas.Series
        Values to filter.
    cut : number, default 3
        Half-width of the accepted band, in multiples of ``Col.std()``.

    Returns
    -------
    pandas.Series
        The retained entries with their original index labels; entries on or
        outside the band are dropped, not replaced.
    """
    centre = Col.mean()
    half_width = Col.std() * cut
    lower_bound = centre - half_width
    upper_bound = centre + half_width
    inside_band = (Col > lower_bound) & (Col < upper_bound)
    return Col[inside_band]

# Normalized Data
df = Data.copy()

fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
ax = ax.ravel()

# Process every column except the last one.
for i, feature in enumerate(df.columns[:-1]):
    # Normalizer drops the out-of-band rows; assigning the shorter Series back
    # aligns on the index, so the dropped rows become NaN in df.
    df[feature] = Normalizer(Data[feature])
    # Replace those NaN entries with the first mode of the retained values.
    retained_mode = df[feature].dropna().mode()[0]
    df[feature] = df[feature].fillna(retained_mode)
    # Sub-Plots
    sns.distplot(
        df[feature],
        rug=True,
        rug_kws={"color": "red"},
        kde_kws={"color": "k", "lw": 2, "label": "KDE"},
        hist_kws={"histtype": "step", "linewidth": 2, "alpha": 1, "color": "Navy"},
        ax=ax[i],
    )

Basically, we diminished the influence of certain data points (see the following figure).

In [7]:
# Absolute change introduced by the normalisation step, feature by feature;
# the last column ('Outcome') is carried over from Data unchanged.
Temp = Data.copy()
Temp.iloc[:,:-1] = (Data.iloc[:,:-1] - df.iloc[:,:-1]).abs()

# Hover text: a readable class name for every row.
Temp0 = ['Diabetic' if x != 0 else 'Non-Diabetic' for x in Temp['Outcome']]

# (axis label, column name) for each plotted feature; '<br>' wraps long labels.
Splom_Axes = [('Pregnancies', 'Pregnancies'),
              ('Glucose', 'Glucose'),
              ('Blood<br>Pressure', 'Blood Pressure'),
              ('Skin<br>Thickness', 'Skin Thickness'),
              ('Insulin', 'Insulin'),
              ('BMI', 'BMI'),
              ('Diabetes<br>Pedigree<br>Fun', 'Diabetes Pedigree Function'),
              ('Age', 'Age')]

Splom_Trace = go.Splom(
    dimensions=[dict(label=axis_label, values=Temp[column]) for axis_label, column in Splom_Axes],
    showupperhalf=False,
    diagonal=dict(visible=False),
    text=Temp0,
    marker=dict(color=Temp['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
)
fig = go.Figure(data=Splom_Trace)
del Temp, Temp0, Splom_Axes, Splom_Trace

fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

Data Correlation

In [8]:
def Correlation_Plot (Df,Fig_Size):
    """Draw an annotated lower-triangle heatmap of the pairwise correlations of ``Df``.

    Parameters
    ----------
    Df : pandas.DataFrame
        Frame whose ``.corr()`` matrix (rounded to two decimals) is plotted.
    Fig_Size : number
        Side length used for both the width and the height of the figure.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Hide only the cells strictly above the diagonal (k=1); the diagonal stays visible.
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True
    _, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    # NOTE: the colour range is fixed to [0, 1], so negative correlations all take
    # the lowest colour of the palette.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                annot_kws={"size": 12},
                cmap =sns.color_palette("RdYlGn", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .7})

Correlation_Plot (df, 9)

Export to a CSV file

In [9]:
import os

# Write the normalised frame next to the source file as "<name>_mod<ext>".
# os.path.splitext separates only the final extension, so paths that contain other
# dots (e.g. './data/diabetes.csv') or no extension at all still yield a sensible name.
Root, Ext = os.path.splitext(Path)
Temp = Root + '_mod' + Ext
del Root, Ext
df.to_csv(Temp, index=False, header=True)